# Try to open the help page for the data set; the message below shows that no
# documentation is installed for it.
?wineQualityReds
## No documentation for 'wineQualityReds' in specified packages and libraries:
## you could try '??wineQualityReds'
# Load the red-wine data set from the working directory into `pf` and print
# the structure (column names and types) of the resulting data frame.
pf <- read.csv(file = 'wineQualityReds.csv')
str(object = pf)
## 'data.frame': 1599 obs. of 13 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# Per-column summary statistics (min, quartiles, mean, max) for every variable.
summary(pf)
## X fixed.acidity volatile.acidity citric.acid
## Min. : 1.0 Min. : 4.60 Min. :0.1200 Min. :0.000
## 1st Qu.: 400.5 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
## Median : 800.0 Median : 7.90 Median :0.5200 Median :0.260
## Mean : 800.0 Mean : 8.32 Mean :0.5278 Mean :0.271
## 3rd Qu.:1199.5 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
## Max. :1599.0 Max. :15.90 Max. :1.5800 Max. :1.000
## residual.sugar chlorides free.sulfur.dioxide
## Min. : 0.900 Min. :0.01200 Min. : 1.00
## 1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
## Median : 2.200 Median :0.07900 Median :14.00
## Mean : 2.539 Mean :0.08747 Mean :15.87
## 3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
## Max. :15.500 Max. :0.61100 Max. :72.00
## total.sulfur.dioxide density pH sulphates
## Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
## 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
## Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
## Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
## 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
## Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
## alcohol quality
## Min. : 8.40 Min. :3.000
## 1st Qu.: 9.50 1st Qu.:5.000
## Median :10.20 Median :6.000
## Mean :10.42 Mean :5.636
## 3rd Qu.:11.10 3rd Qu.:6.000
## Max. :14.90 Max. :8.000
该数据集共有13个变量,1599条记录。其中X变量和quality变量是整型(int),其余11个变量均为数值型(num)。residual.sugar变量的最大值是15.5,而其第三四分位数仅为2.6,因此该最大值可以看做是异常值。同样可看做异常值的还有chlorides变量、free.sulfur.dioxide变量、total.sulfur.dioxide变量、density变量、sulphates变量、alcohol变量和quality变量的最大值。
library(ggplot2)
# Count how many samples received each quality score.
table(pf$quality)
##
## 3 4 5 6 7 8
## 10 53 681 638 199 18
# Histogram of the quality scores. quality is an integer score, so use one bin
# per score rather than the default of 30 bins, which leaves most bins empty
# and makes ggplot2 emit its "Pick better value with `binwidth`" message.
qplot(quality, data = pf, binwidth = 1)
评分为5分和6分的酒的数量明显多于其他评分的酒。
library(gridExtra)
# Quality score against each of the three acidity measures, drawn side by side.
# A low alpha makes overplotted points visible.
p1_1 <- ggplot(data = pf, mapping = aes(x = quality, y = fixed.acidity)) +
  geom_point(color = 'red', alpha = .1)
p1_2 <- ggplot(data = pf, mapping = aes(x = quality, y = volatile.acidity)) +
  geom_point(color = 'blue', alpha = .1)
p1_3 <- ggplot(data = pf, mapping = aes(x = quality, y = citric.acid)) +
  geom_point(color = 'yellow', alpha = .1)

# One row, three columns.
grid.arrange(p1_1, p1_2, p1_3, ncol = 3)
三种酸度与酒的质量并没有明显的线性关系。在不同质量的酒中,三种酸度的数值跨度都很大。但也可以看出,在质量评分为5和6分的酒中,fixed.acidity的数值集中在 4-12 之间,volatile.acidity的数值集中在 0.2-0.8 之间,citric.acid的数值集中在 0.00-0.625 之间。
# Scatter plot of residual sugar for each quality score; a low alpha shows
# where the points pile up.
ggplot(data = pf, mapping = aes(x = quality, y = residual.sugar)) +
  geom_point(alpha = .1)
从图上看,residual.sugar的数值大部分在1-4之间。重点查看一下评分5和6的酒的residual.sugar数值。
# Summary statistics of residual sugar for the wines scored 5.
with(subset(pf, quality == 5), summary(residual.sugar))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.200 1.900 2.200 2.529 2.600 15.500
# Summary statistics of residual sugar for the wines scored 6.
with(subset(pf, quality == 6), summary(residual.sugar))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.477 2.500 15.400
# Box plot of residual sugar for the wines with one quality score, zoomed to
# the 1-4 range (coord_cartesian zooms without dropping data, so the box
# statistics still use every sample). The mean is marked with a cross.
# NOTE(review): `fun.y` is deprecated in favour of `fun` since ggplot2 3.3.0;
# switch once the installed ggplot2 version is confirmed.
sugar_boxplot <- function(score, outline) {
  ggplot(data = subset(pf, quality == score),
         mapping = aes(x = quality, y = residual.sugar)) +
    geom_boxplot(color = outline) +
    coord_cartesian(ylim = c(1, 4)) +
    stat_summary(fun.y = mean, geom = 'point', shape = 4)
}

p2_1 <- sugar_boxplot(5, 'red')
p2_2 <- sugar_boxplot(6, 'blue')
grid.arrange(p2_1, p2_2, ncol = 2)
可以看出,评分5和6的酒的residual.sugar的数值大部分在1-3.5之间,均值分别为2.529和2.477,第一四分位值和中位数完全一致。
# Residual sugar against quality, zoomed to the 5th-95th percentile range of
# residual sugar, with one summary line per statistic across quality scores:
#   black solid  - mean
#   blue solid   - median (50th percentile)
#   blue dashed  - 10th and 90th percentiles
# The quantile argument is spelled out as `probs`; the previous `prob` only
# worked through R's partial argument matching.
sugar_limits <- unname(quantile(pf$residual.sugar, probs = c(.05, .95)))

ggplot(aes(quality, residual.sugar), data = pf) +
  coord_cartesian(ylim = sugar_limits) +
  scale_y_continuous(breaks = seq(sugar_limits[1], sugar_limits[2], .2)) +
  geom_point(alpha = 0.1, color = 'orange') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile, fun.args = list(probs = .9),
            linetype = 2, color = 'blue')
不同质量的酒的residual.sugar数值,中位数和均值的变化比较平缓,但是90%分位数的变化较大。可能是因为除了评分5和6的酒,其他分值的酒样本较少,90%分位数受极值影响较大。
library(reshape2)
# Keep only the wines scored 5 or 6, spread chlorides into one column per
# score (keyed by the sample id X), then summarise each column. Every sample
# has a single score, so the other score's column is NA for that row.
mid_quality <- subset(pf, quality %in% c(5, 6))
chlorides_wide <- dcast(mid_quality, X ~ quality, value.var = 'chlorides')
summary(chlorides_wide)
## X 5 6
## Min. : 1.0 Min. :0.0390 Min. :0.0340
## 1st Qu.: 382.5 1st Qu.:0.0740 1st Qu.:0.0682
## Median : 768.0 Median :0.0810 Median :0.0780
## Mean : 793.0 Mean :0.0927 Mean :0.0850
## 3rd Qu.:1219.5 3rd Qu.:0.0940 3rd Qu.:0.0880
## Max. :1599.0 Max. :0.6110 Max. :0.4150
## NA's :638 NA's :681
# Box plots of chlorides for the wines scored 5 and 6, one facet per score,
# zoomed to 0.03-0.15; the mean is marked with a cross.
# quality is a continuous x aesthetic, so the grouping is stated explicitly
# with `group = quality`. Without it ggplot2 warns
# "Continuous x aesthetic -- did you forget aes(group=...)?".
ggplot(aes(quality, chlorides, group = quality),
       data = subset(pf, quality == 5 | quality == 6)) +
  facet_wrap(~quality) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0.03, 0.15)) +
  stat_summary(fun.y = mean, geom = 'point', shape = 4)
可以看出,评分5和6的酒的chlorides的数值大部分在0.04-0.12之间,均值分别为0.0927和0.0850,中位数分别为0.0810和0.0780。
# Quality score against free sulfur dioxide, total sulfur dioxide and
# sulphates, drawn side by side.
p3_1 <- ggplot(data = pf, mapping = aes(x = quality, y = free.sulfur.dioxide)) +
  geom_point(color = 'red', alpha = .1)
p3_2 <- ggplot(data = pf, mapping = aes(x = quality, y = total.sulfur.dioxide)) +
  geom_point(color = 'blue', alpha = .1)
p3_3 <- ggplot(data = pf, mapping = aes(x = quality, y = sulphates)) +
  geom_point(color = 'orange', alpha = .1)

grid.arrange(p3_1, p3_2, p3_3, ncol = 3)
重点查看评分为5和6的酒的free.sulfur.dioxide数据
# Keep only the wines scored 5 or 6, spread free.sulfur.dioxide into one
# column per score (keyed by the sample id X), then summarise each column.
mid_quality <- subset(pf, quality %in% c(5, 6))
free_so2_wide <- dcast(mid_quality, X ~ quality,
                       value.var = 'free.sulfur.dioxide')
summary(free_so2_wide)
## X 5 6
## Min. : 1.0 Min. : 3.00 Min. : 1.00
## 1st Qu.: 382.5 1st Qu.: 9.00 1st Qu.: 8.00
## Median : 768.0 Median :15.00 Median :14.00
## Mean : 793.0 Mean :16.98 Mean :15.71
## 3rd Qu.:1219.5 3rd Qu.:23.00 3rd Qu.:21.00
## Max. :1599.0 Max. :68.00 Max. :72.00
## NA's :638 NA's :681
# Box plots of free sulfur dioxide for the wines scored 5 and 6, one facet per
# score, zoomed to 0-50; the mean is marked with a cross.
# quality is a continuous x aesthetic, so the grouping is stated explicitly
# with `group = quality`. Without it ggplot2 warns
# "Continuous x aesthetic -- did you forget aes(group=...)?".
ggplot(aes(quality, free.sulfur.dioxide, group = quality),
       data = subset(pf, quality == 5 | quality == 6)) +
  facet_wrap(~quality) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 50)) +
  stat_summary(fun.y = mean, geom = 'point', shape = 4)
可以看出,评分5和6的酒的free.sulfur.dioxide数据大部分在1-40之间,均值分别为16.98和15.71,中位数分别为15和14。
# Quality score against density, pH and alcohol, drawn side by side.
p4_1 <- ggplot(data = pf, mapping = aes(x = quality, y = density)) +
  geom_point(color = 'red', alpha = .1)
p4_2 <- ggplot(data = pf, mapping = aes(x = quality, y = pH)) +
  geom_point(color = 'blue', alpha = .1)
p4_3 <- ggplot(data = pf, mapping = aes(x = quality, y = alcohol)) +
  geom_point(color = 'orange', alpha = .1)

grid.arrange(p4_1, p4_2, p4_3, ncol = 3)
评分5和6的酒,density和pH的数值分布较为接近。评分5的酒的alcohol值在9-11.5之间,评分6的酒的alcohol值在9-13之间。
# Fixed against volatile acidity with a fitted linear trend line, followed by
# a Pearson correlation test on the same pair.
ggplot(data = pf, mapping = aes(x = fixed.acidity, y = volatile.acidity)) +
  geom_point(alpha = .4) +
  geom_smooth(color = 'orange', method = 'lm')

with(pf, cor.test(x = fixed.acidity, y = volatile.acidity))
##
## Pearson's product-moment correlation
##
## data: fixed.acidity and volatile.acidity
## t = -10.589, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3013681 -0.2097433
## sample estimates:
## cor
## -0.2561309
fixed.acidity和volatile.acidity之间存在较弱的负相关性
# Fixed acidity against citric acid with a fitted linear trend line, followed
# by a Pearson correlation test on the same pair.
ggplot(data = pf, mapping = aes(x = fixed.acidity, y = citric.acid)) +
  geom_point(alpha = .4) +
  geom_smooth(color = 'orange', method = 'lm')

with(pf, cor.test(x = fixed.acidity, y = citric.acid))
##
## Pearson's product-moment correlation
##
## data: fixed.acidity and citric.acid
## t = 36.234, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6438839 0.6977493
## sample estimates:
## cor
## 0.6717034
fixed.acidity和citric.acid之间存在较强的正相关性
# Volatile acidity against citric acid with a fitted linear trend line; the
# y axis is zoomed to 0-1. A Pearson correlation test on the pair follows.
ggplot(data = pf, mapping = aes(x = volatile.acidity, y = citric.acid)) +
  geom_point(alpha = .4) +
  coord_cartesian(ylim = c(0, 1)) +
  geom_smooth(color = 'orange', method = 'lm')

with(pf, cor.test(x = volatile.acidity, y = citric.acid))
##
## Pearson's product-moment correlation
##
## data: volatile.acidity and citric.acid
## t = -26.489, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5856550 -0.5174902
## sample estimates:
## cor
## -0.5524957
volatile.acidity和citric.acid之间存在较强的负相关性
# Free against total sulfur dioxide, followed by a Pearson correlation test on
# the same pair.
ggplot(data = pf,
       mapping = aes(x = free.sulfur.dioxide, y = total.sulfur.dioxide)) +
  geom_point(alpha = .2)

with(pf, cor.test(x = free.sulfur.dioxide, y = total.sulfur.dioxide))
##
## Pearson's product-moment correlation
##
## data: free.sulfur.dioxide and total.sulfur.dioxide
## t = 35.84, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6395786 0.6939740
## sample estimates:
## cor
## 0.6676665
free.sulfur.dioxide和total.sulfur.dioxide之间存在较强的正相关性
# Mean free / total sulfur dioxide per sulphates bin. The x value is sulphates
# rounded to the nearest multiple of 0.25, so each point on a line is the mean
# of one 0.25-wide bin.
p5_1 <- ggplot(data = pf,
               mapping = aes(x = 0.25 * round(sulphates / 0.25),
                             y = free.sulfur.dioxide)) +
  geom_line(stat = 'summary', fun.y = mean, color = 'red')
p5_2 <- ggplot(data = pf,
               mapping = aes(x = 0.25 * round(sulphates / 0.25),
                             y = total.sulfur.dioxide)) +
  geom_line(stat = 'summary', fun.y = mean, color = 'blue')
grid.arrange(p5_1, p5_2, ncol = 2)

# Pearson correlation test on the unbinned free sulfur dioxide and sulphates.
with(pf, cor.test(x = free.sulfur.dioxide, y = sulphates))
##
## Pearson's product-moment correlation
##
## data: free.sulfur.dioxide and sulphates
## t = 2.0671, df = 1597, p-value = 0.03888
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.002643125 0.100424406
## sample estimates:
## cor
## 0.05165757
# Pearson correlation test between total sulfur dioxide and sulphates.
with(pf, cor.test(total.sulfur.dioxide, sulphates))
##
## Pearson's product-moment correlation
##
## data: total.sulfur.dioxide and sulphates
## t = 1.7178, df = 1597, p-value = 0.08602
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.006087119 0.091774762
## sample estimates:
## cor
## 0.04294684
从相关系数上看,free.sulfur.dioxide、total.sulfur.dioxide与sulphates之间几乎没有线性相关性(相关系数分别约为0.05和0.04)。在把x轴(sulphates)的组距增加至0.25后发现,free.sulfur.dioxide和total.sulfur.dioxide在各组内的均值随sulphates的增加而增加。
# Density against alcohol, with the per-alcohol-value mean (solid line) and
# median (dashed line) of density overlaid. A Pearson correlation test on the
# pair follows.
ggplot(data = pf, mapping = aes(x = alcohol, y = density)) +
  geom_point(color = 'orange', alpha = .25) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = median, linetype = 2)

with(pf, cor.test(x = alcohol, y = density))
##
## Pearson's product-moment correlation
##
## data: alcohol and density
## t = -22.838, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5322547 -0.4583061
## sample estimates:
## cor
## -0.4961798
alcohol与density之间存在中度负相关性。
# Density against residual sugar, with the x axis zoomed to the 1st-95th
# percentile of residual sugar. The per-x mean (solid line) and median (dashed
# line) of density are overlaid. A Pearson correlation test on the pair follows.
ggplot(data = pf, mapping = aes(x = residual.sugar, y = density)) +
  coord_cartesian(
    xlim = c(quantile(pf$residual.sugar, .01),
             quantile(pf$residual.sugar, .95))
  ) +
  geom_point(color = 'orange', alpha = .25) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = median, linetype = 2)

with(pf, cor.test(x = residual.sugar, y = density))
##
## Pearson's product-moment correlation
##
## data: residual.sugar and density
## t = 15.189, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3116908 0.3973835
## sample estimates:
## cor
## 0.3552834
residual.sugar与density之间存在中度正相关性
# pH against each of the three acidity measures. The three panels are laid out
# on a two-column grid.
p6_1 <- ggplot(data = pf, mapping = aes(x = pH, y = fixed.acidity)) +
  geom_point(color = 'red', alpha = .25)
p6_2 <- ggplot(data = pf, mapping = aes(x = pH, y = volatile.acidity)) +
  geom_point(color = 'blue', alpha = .25)
p6_3 <- ggplot(data = pf, mapping = aes(x = pH, y = citric.acid)) +
  geom_point(color = 'orange', alpha = .25)
grid.arrange(p6_1, p6_2, p6_3, ncol = 2)

# Pearson correlation test between pH and fixed acidity.
with(pf, cor.test(x = pH, y = fixed.acidity))
##
## Pearson's product-moment correlation
##
## data: pH and fixed.acidity
## t = -37.366, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7082857 -0.6559174
## sample estimates:
## cor
## -0.6829782
# Pearson correlation test between pH and volatile acidity.
with(pf, cor.test(pH, volatile.acidity))
##
## Pearson's product-moment correlation
##
## data: pH and volatile.acidity
## t = 9.659, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1880823 0.2807254
## sample estimates:
## cor
## 0.2349373
# Pearson correlation test between pH and citric acid.
with(pf, cor.test(pH, citric.acid))
##
## Pearson's product-moment correlation
##
## data: pH and citric.acid
## t = -25.767, df = 1597, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.5756337 -0.5063336
## sample estimates:
## cor
## -0.5419041
fixed.acidity、volatile.acidity、citric.acid三项指标中,fixed.acidity与pH值的相关性最大,相关系数为-0.683。
# Scatter plot of pH against fixed acidity, coloured by each sample's density.
ggplot(aes(fixed.acidity, pH), data = pf) +
  geom_point(aes(color = density))

# Median pH per fixed.acidity value, coloured by the median density of the
# same samples. Both columns are aggregated up front: stat = 'summary' only
# summarises the y aesthetic, so mapping the raw per-row density to the line
# colour does not yield a well-defined colour for an x value that has several
# samples.
ph_by_acidity <- aggregate(cbind(pH, density) ~ fixed.acidity,
                           data = pf, FUN = median)
ggplot(aes(fixed.acidity, pH), data = ph_by_acidity) +
  geom_line(aes(color = density))
可以看出,pH值的中位数随fixed.acidity值的增加而降低。密度较小的酒pH值一般较高,密度较大的酒一般pH值较低。
# pH against residual sugar, with the x axis zoomed to the 5th-95th percentile
# of residual sugar. A Pearson correlation test on the pair follows.
ggplot(data = pf, mapping = aes(x = residual.sugar, y = pH)) +
  coord_cartesian(
    xlim = c(quantile(pf$residual.sugar, .05),
             quantile(pf$residual.sugar, .95))
  ) +
  geom_point(alpha = .5)

with(pf, cor.test(x = residual.sugar, y = pH))
##
## Pearson's product-moment correlation
##
## data: residual.sugar and pH
## t = -3.4355, df = 1597, p-value = 0.0006066
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.13411046 -0.03678574
## sample estimates:
## cor
## -0.08565242
pH值与residual.sugar之间几乎没有线性相关性(相关系数约为-0.086)。
# Final plot 1: histogram of the quality scores, with one axis tick per score
# (3 to 8) and a centered title.
ggplot(data = pf, mapping = aes(x = quality)) +
  geom_histogram(binwidth = .5) +
  scale_x_continuous(breaks = seq(3, 8, 1)) +
  labs(x = 'Quality Score',
       y = 'Number of Sample',
       title = 'Distribution of Quality Score') +
  theme(plot.title = element_text(hjust = 0.5))  # center the title
样本中评分为5和6的酒的数量最多,数量远大于其他分值的酒。
# Final plot 2: box plots of residual sugar for the wines scored 5 and 6.
# Each panel is zoomed to 1-4 on the y axis with a tick every 0.1, has x ticks
# at score - 0.5, score and score + 0.5, and marks the mean with a cross.
# The per-panel axis titles are blanked; grid.arrange supplies shared ones.
final_sugar_boxplot <- function(score, outline) {
  ggplot(data = subset(pf, quality == score),
         mapping = aes(x = quality, y = residual.sugar)) +
    geom_boxplot(color = outline) +
    labs(x = '', y = '') +
    coord_cartesian(ylim = c(1, 4)) +
    scale_y_continuous(breaks = seq(1, 4, .1)) +
    scale_x_continuous(breaks = seq(score - .5, score + .5, .5)) +
    stat_summary(fun.y = mean, geom = 'point', shape = 4)
}

p2_1 <- final_sugar_boxplot(5, 'red')
p2_2 <- final_sugar_boxplot(6, 'blue')

grid.arrange(p2_1, p2_2, ncol = 2,
             left = 'Residual.sugar Value',
             top = 'Distribution of Residual.sugar with Score 5 and 6',
             bottom = 'Quality Score')
residual.sugar的数值大部分在1-3.5之间,均值分别为2.529和2.477,第一四分位值和中位数完全一致。
# Final plot 3: median pH per fixed.acidity value, coloured by the median
# density of the same samples. Both columns are aggregated up front:
# stat = 'summary' only summarises the y aesthetic, so mapping the raw per-row
# density to the line colour does not yield a well-defined colour for an x
# value that has several samples.
ph_by_acidity <- aggregate(cbind(pH, density) ~ fixed.acidity,
                           data = pf, FUN = median)
ggplot(aes(fixed.acidity, pH), data = ph_by_acidity) +
  geom_line(aes(color = density)) +
  scale_x_continuous(breaks = seq(6, 16, 2)) +
  labs(x = 'Fixed.acidity Value',
       y = 'pH Value',
       title = 'Relationship of Fixed.acidity and pH with Different Density') +
  theme(plot.title = element_text(hjust = 0.5))
pH值的中位数随fixed.acidity值的增加而降低。密度较小的酒pH值一般较高,密度较大的酒一般pH值较低。
开始时,我认为数据间会存在明显的相关性,但随着分析的深入,发现相关性并没有那么明显。例如,我在分析residual.sugar和pH的关系时,开始时认为residual.sugar的值越小,pH也应该越小,但通过分析发现,二者几乎没有线性相关性。在未来的分析中,应当注意不要在分析数据前就形成先入为主的想法,否则容易造成方向错误。